% Make dataset

% Start from an empty workspace so nothing from a previous run leaks in.
clear all;
% Small tolerance constant; not referenced in the visible part of this script.
small = 1.0e-10;

% -- File Directories (relative paths, each ending in '/')
% figdir is not referenced in the visible part of this script.
figdir = 'fig/';
outdir = 'out/';
matdir = 'mat/';

% Load Data for future use.
% Each load() brings the variables stored in that MAT-file into the
% workspace; the code below relies on year_linked_all, country_linked_all,
% country_code_linked_all, yp_linked_all and pop_linked_all being defined
% afterwards (presumably one per file -- verify against the MAT-files).
fstr = [matdir 'year_linked_all_pwt91']; load(fstr);
fstr = [matdir 'country_linked_all_pwt91']; load(fstr);
fstr = [matdir 'country_code_linked_all_pwt91']; load(fstr);
fstr = [matdir 'yp_linked_all_pwt91']; load(fstr);
fstr = [matdir 'pop_linked_all_pwt91']; load(fstr);
% Calendar vector: one entry per row (date) of the data
calvec = year_linked_all;
% Countries are the columns of yp_linked_all, dates are the rows of calvec
n_countries = size(yp_linked_all,2);
n_t = size(calvec,1);

% Some cuts at the data
% For each country, count the consecutive non-missing observations at the
% end of the sample, i.e. walk backwards from the last date until the
% first NaN is met.
yp_linked_nyears_all = zeros(n_countries,1);
for ic = 1:n_countries
    % Reverse the series so that index 1 is the last date in the sample
    tmp = flipud(yp_linked_all(:,ic));
    % Position (counting back from the end) of the most recent missing
    % value among the n_t dates; empty when the series has no gaps
    first_nan = find(isnan(tmp(1:n_t)),1);
    if isempty(first_nan)
        % No missing values: every date counts
        yp_linked_nyears_all(ic) = n_t;
    else
        % Observations after the most recent NaN form the usable run.
        % This also handles the case where only the earliest date is
        % missing, which must give n_t-1 rather than n_t.
        yp_linked_nyears_all(ic) = first_nan-1;
    end
%     ss = char(country_linked_all(ic));
%     fprintf([ss '\n']);
%     fprintf('%4i \n\n',yp_linked_nyears_all(ic));
end

% Sample selection: keep countries that pass both thresholds
iyears = 50;    % minimum number of consecutive end-of-sample observations
ipop = 3;       % minimum last-date population, in the units of pop_linked_all
ii1 = (yp_linked_nyears_all >= iyears);
ii2 = (pop_linked_all(end,:) >= ipop)';
% 1 where both criteria hold, 0 otherwise (stored as double)
ii = double(ii1 & ii2);
keep = (ii == 1);

% Restrict every linked series to the retained countries
yp = yp_linked_all(:,keep);
pop = pop_linked_all(:,keep);
country = country_linked_all(keep);
country_code = country_code_linked_all(keep);
yp_nyears = yp_linked_nyears_all(keep);
nc = size(yp,2);

% Compute Total Output at end of sample
% Totals are sum over countries of (yp * pop) at the last date, for the
% selected sample (y, p) and for all linked countries (yall, pall).
yp_end_all = yp_linked_all(end,:)';
pop_end_all = pop_linked_all(end,:)';
% Drop missing values JOINTLY: filtering the two vectors separately would
% misalign countries (or produce different lengths) whenever yp and pop
% are missing for different countries.
ok = ~isnan(yp_end_all) & ~isnan(pop_end_all);
tmp1 = yp_end_all(ok);
tmp2 = pop_end_all(ok);
y = yp(end,:)*pop(end,:)';
yall = tmp1'*tmp2;
p = sum(pop(end,:)');
% Skip missing populations so a single NaN does not turn the total into NaN
pall = sum(pop_end_all(~isnan(pop_end_all)));
fprintf('Shares of GDP and Pop at last date: %5.2f  %5.2f \n',[y/yall p/pall]);

% Draw the selected series on a log scale in a full-screen window; the
% figure is closed again right away and is not written to disk here.
plot(calvec,yp,'LineWidth',2);
hax = gca;
hfig = gcf;
set(hax, 'YScale', 'log');
xlim(hax,[1900 2020]);
screen_rect = get(0, 'Screensize');
set(hfig, 'Position', screen_rect);  % Full Screen
close(hfig);

% Save matlab files with data: one MAT-file per variable, each file named
% after the single variable it contains.
vars_to_save = {'calvec','country','country_code','yp','pop'};
for k = 1:numel(vars_to_save)
    fstr = [matdir vars_to_save{k}];
    save(fstr,vars_to_save{k});
end

% Save data as CSV file (yp): two header rows (country codes, country
% names) followed by one row per date.
outfile_name = [outdir 'yp.csv'];
fileID = fopen(outfile_name,'w');
if fileID < 0
    % Fail with a clear message instead of an invalid-identifier error later
    error('Could not open %s for writing.',outfile_name);
end
% Header row 1: country codes
fprintf(fileID,'Year,');
for i = 1:nc
    ss = char(country_code(i));
    % Write through '%s' so that '%' or '\' in the text is not
    % interpreted as part of a format specification
    fprintf(fileID,'%s',ss);
    if i < nc
        fprintf(fileID,',');
    else
        fprintf(fileID,'\n');
    end
end
% Header row 2: country names (first column left blank)
fprintf(fileID,',');
for i = 1:nc
    ss = char(country(i));
    fprintf(fileID,'%s',ss);
    if i < nc
        fprintf(fileID,',');
    else
        fprintf(fileID,'\n');
    end
end
% Data rows: year followed by the values for that date
for t = 1:n_t
    fprintf(fileID,'%5i,',calvec(t));
    prtmat_comma(yp(t,:),fileID,'%12.2f','\n');
end
% Release the file handle so the output is flushed to disk
fclose(fileID);

% Save data as CSV file (pop): two header rows (country codes, country
% names) followed by one row per date.
outfile_name = [outdir 'pop.csv'];
fileID = fopen(outfile_name,'w');
if fileID < 0
    % Fail with a clear message instead of an invalid-identifier error later
    error('Could not open %s for writing.',outfile_name);
end
% Header row 1: country codes
fprintf(fileID,'Year,');
for i = 1:nc
    ss = char(country_code(i));
    % Write through '%s' so that '%' or '\' in the text is not
    % interpreted as part of a format specification
    fprintf(fileID,'%s',ss);
    if i < nc
        fprintf(fileID,',');
    else
        fprintf(fileID,'\n');
    end
end
% Header row 2: country names (first column left blank)
fprintf(fileID,',');
for i = 1:nc
    ss = char(country(i));
    fprintf(fileID,'%s',ss);
    if i < nc
        fprintf(fileID,',');
    else
        fprintf(fileID,'\n');
    end
end
% Data rows: year followed by the values for that date
for t = 1:n_t
    fprintf(fileID,'%5i,',calvec(t));
    prtmat_comma(pop(t,:),fileID,'%12.2f','\n');
end
% Release the file handle so the output is flushed to disk
fclose(fileID);

% List countries that are excluded (ii == 0), one row per country, with
% the last-date values, the total count of non-missing yp observations and
% the two selection flags.
outfile_name = [outdir 'excluded_countries.csv'];
fileID = fopen(outfile_name,'w');
if fileID < 0
    % Fail with a clear message instead of an invalid-identifier error later
    error('Could not open %s for writing.',outfile_name);
end
% Header matches the seven columns written below (the former trailing
% 'ii3' column had no data behind it).
% NOTE(review): the '(2014)' labels are hard-coded; the values written are
% those of the last row of the data -- confirm the year.
fprintf(fileID,'Country,Country,GDP(2014),Pop(2014),nyears,ii1,ii2 \n');
for i = 1:size(ii,1)
    if ii(i) == 0
        str1 = char(country_code_linked_all(i));
        str2 = char(country_linked_all(i));
        % Pass the names as data, not as the format string, so '%' or '\'
        % in a name cannot corrupt the output
        fprintf(fileID,'%s,%s,',str1,str2);
        fprintf(fileID,'%8.1f ,',yp_linked_all(end,i));
        fprintf(fileID,'%8.1f ,',pop_linked_all(end,i));
        % Total number of non-missing observations over the whole sample
        jj = isnan(yp_linked_all(:,i));
        nnn = sum(jj==0);
        fprintf(fileID,'%3i ,',nnn);
        fprintf(fileID,'%2i, %2i \n',[ii1(i) ii2(i)]);
    end
end
% Release the file handle so the output is flushed to disk
fclose(fileID);

% Report how many countries each criterion removes, then the mean
% last-date population of those failing the population criterion.
n_excluded = nnz(ii == 0);
n_short = nnz(~ii1);
n_small = nnz(~ii2);
n_both = nnz(~ii1 & ~ii2);
fprintf('Number excluded: %3i \n',n_excluded);
fprintf('Number excluded because of short samples: %3i \n',n_short);
fprintf('Number excluded because of small population: %3i \n',n_small);
fprintf('Number excluded because of both: %3i \n',n_both);
fails_pop = ~ii2';
pop_ex = pop_linked_all(end,fails_pop);
tmp = mean(pop_ex(:));
fprintf('average population of countries excluded by pop %5.2f \n',tmp);

% Compute Number of Countries for each date: for every row of yp, count
% the selected countries with a non-missing value and write year,count.
outfile_name = [outdir 'number_of_countries_by_date.csv'];
fileID = fopen(outfile_name,'w');
if fileID < 0
    % Fail with a clear message instead of an invalid-identifier error later
    error('Could not open %s for writing.',outfile_name);
end
fprintf(fileID,'Year, Number \n');
yp_missing = isnan(yp);
for t = 1:n_t
    fprintf(fileID,'%4i ,',year_linked_all(t));
    fprintf(fileID,'%4i \n',sum(yp_missing(t,:)' == 0));
end
% Release the file handle so the output is flushed to disk
fclose(fileID);



